*===============================================================================
*
*					WORKER BELIEFS ABOUT OUTSIDE OPTIONS
*		(c)	Simon Jaeger, Christopher Roth, Nina Roussille, Benjamin Schoefer
*							  2023 December 5
*						   	   SOEP-IAB Data 
*
*===============================================================================


********************************************************************************
*							Finalize Data Clean	 							   *
********************************************************************************

* close any log still open from an earlier run (error suppressed if none is open)
capture log close
* path is quoted so that a ${log} directory containing spaces still resolves
log using "${log}/7_finalize.log", replace
* fix the random-number seed for this do-file
set seed 6000



*** looping over sample restrictions

* "" means no coworker sample restrictions
* "_unemp" means restricting to job switches intermediated by an unemployment spell
* "movers20" means restricting to firms with at least 20 movers over the period 2015-2019
* "_socc" means defining coworker wage changes at the firm-occupation level, and restricting to movers switching within occupation
* "_sedu" means defining coworker wage changes at the firm-education category level
* "_sage" means defining coworker wage changes at the firm-age bracket level
* "_sinc" means defining coworker wage changes at the firm-earnings quintile level

* "FTp" means restricting to fulltime-to-fulltime coworker switches (but keeping both full- and part-time SOEP respondents)
* "nz" means keeping only SOEP respondents who did not respond "zero" when asked about the wage change accompanying a switch to their outside option

* Builds and saves one analysis dataset per sample definition.
* Outer loop: coworker sample restriction; inner loop: additional restriction.
* Each pass loads the SOEP survey data, merges on the IAB-derived files,
* applies the sample restrictions, winsorizes / constructs variables, and
* saves "$data/<rprefix>_finaldata.dta".
* The whole pass is wrapped in -cap noisily-, so an error in one sample is
* printed to the log but does not stop the remaining samples.
foreach restriction in "" "_unemp" "movers20" "_socc" "_sedu" "_sage" "_sinc" {
	
	foreach addrest in "FTp" "nz" {
	
	* "FTp" is built for every restriction; "nz" only for the unrestricted ("") coworker sample
	if "`addrest'"=="FTp" | ("`restriction'"=="" & "`addrest'"=="nz") {
	cap noisily {

	di "`restriction' `addrest'" // printing for log file
	
	*** setting dataset prefixes based on location in the loop
	if "`restriction'"=="" local rprefix ""
	if "`restriction'"=="_unemp" local rprefix "u_"
	if "`restriction'"=="movers20" local rprefix "m20_"
	if "`restriction'"=="_socc" local rprefix "so_"
	if "`restriction'"=="_sedu" local rprefix "se_"
	if "`restriction'"=="_sage" local rprefix "sa_"
	if "`restriction'"=="_sinc" local rprefix "si_"
		
	* the additional restriction is prepended, e.g. "FTp_u_" or "nz_"
	if "`addrest'"=="FTp" local rprefix "FTp_`rprefix'" 
	if "`addrest'"=="nz" local rprefix "nz_`rprefix'"
	
	
		
	*** loading SOEP data (2019 and 2020 long-format survey files stacked)
	use "$data/GSOEPIS2019_survey_long.dta", clear 
	append using "$data/GSOEPIS2020_survey_long.dta"
		
	* survey year is renamed to match the year variable used as merge key below
	rename syear jahr
	
	
	*** merging SOEP individual IDs onto IAB firm IDs
	* 1:m merge: a person-year may match several rows of the using file
	di "merging on IAB firm IDs"
	merge 1:m pid jahr using "${data}/data_prep_ADIAB_pid_betnr_jahr.dta" // this file created by 1_data_prep.do
	drop if _merge == 2
	drop _merge
	
	*** merging SOEP individuals onto their wage rank in their occupation
	di "merging on occupation rank"
	merge m:1 pid using "$data/within_occupation_rank_2019.dta" // this file created by 4_occupationrank.do
	drop if _merge == 2
	drop _merge 
		
	*** if we're in the 'intermediate unemployment' specification, merging on ML predictions
	* ML predictions aren't calculated for the non-'intermediate unemployment' specs
	if "`restriction'"=="_unemp"{
	
		* merging SOEP individuals onto ML predictions
		di "observation check merging on ML predictions `rprefix'"
		cap noisily merge m:1 pid using "$data/machinelearning_predictions`restriction'FT.dta", keep(master match)
		cap noisily drop _merge
		
		* predictions are merged by person only, so they are blanked out for all years other than 2019
		cap noisily replace p_delta_ln_wage_mover = . if jahr!=2019
		cap noisily su p_delta_ln_wage_mover, d
		cap noisily tab jahr if p_delta_ln_wage_mover!=.
				
		
		* merging SOEP individuals onto ML predictions calculated over only 2018-2019 data, for a robustness check
		di "merging on 2018-2019 ML predictions"
		cap noisily merge m:1 pid using "$data/machinelearning_predictions`restriction'FT1819.dta", keep(master match) nogen
		cap noisily replace p_delta_ln_wage_mover1819 = . if jahr!=2019
		cap noisily su p_delta_ln_wage_mover1819
				
		* winsorizing ML predictions
		* (-winsor- writes to a new variable, so the raw variable is renamed first and the winsorized copy takes over the original name)
		rename p_delta_ln_wage_mover _p_delta_ln_wage_mover
		winsor _p_delta_ln_wage_mover, p(0.02) gen(p_delta_ln_wage_mover)
		drop _p_delta_ln_wage_mover
		rename p_delta_ln_wage_mover1819 _p_delta_ln_wage_mover1819
		winsor _p_delta_ln_wage_mover1819, p(0.02) gen(p_delta_ln_wage_mover1819)
		drop _p_delta_ln_wage_mover1819
		
		* merging SOEP IDs onto the IAB covariates used in the ML regressions, for plotting of beliefs/ML predictions against these covariates
		di "merging on IAB ML covariates"
		cap noisily merge m:1 pid using "$data/machinelearning_fullcovariates`restriction'FT.dta", keep(master match) nogen

		* dropped if present; no error if the variable does not exist
		cap drop temprand
		
	}


	
	*** merging SOEP individuals onto IAB characteristics to enable merging them onto "wage changes of similar coworkers"
	* characteristics: occupation, education level, age bracket, earnings quintile
	di "merging on IAB characteristics for similar coworkers merge"
	cap noisily merge m:1 pid using "$data/gsoep_adminchars.dta", keep(master match)
	cap noisily drop _merge

	*** merging SOEP individuals onto coworker wage changes data
	di "merging on coworker wage changes"
	
	** name of the merge file depends on position in the loop
	
	* if we're in the "at least 20 coworker movers" loop
	if "`restriction'"=="movers20" {
		
		* merging SOEP individuals onto coworker wage changes using firm IDs
		di "observation check merging on coworker wage changes `rprefix'"
		merge m:1 betnr using "$data/coworker_wage_changesFT.dta"
		
		tab jahr if _merge==3
		
		drop if _merge == 2
		drop _merge
		
		* restricting to SOEP individuals with at least 20 coworker movers in the 2015-2019 sample period
		* (implemented by blanking the coworker wage-change measures; rows are dropped later by the oo_nonm filter)
		foreach year in 2015 2017 {
			foreach var in mean_delta_ln_wage_mover med_delta_ln_wage_mover mean_delta_ln_wage_mover_r0 ///
			mean_delta_ln_wage_mover_r1 med_delta_ln_wage_mover_r0 med_delta_ln_wage_mover_r1 ///
			ebayes_delta_ln_wage_mover {
				replace `var'_`year' = . if num_movers_`year'<20
				
			}
		}
	}
	
	* if we're not in the "at least 20 coworker movers" loop
	if "`restriction'"!="movers20" {
		
		* determining which list of variables to merge on
		if !inlist("`restriction'","_socc","_sedu","_sage","_sinc") local mergelist "betnr"
			
		* if we're restricting to similar coworkers, we merge on both the SOEP individual's firm ID and their characteristics
		if "`restriction'"=="_socc" local mergelist "betnr occ1"
		if "`restriction'"=="_sedu" local mergelist "betnr educ"
		if "`restriction'"=="_sage" local mergelist "betnr agecat"
		if "`restriction'"=="_sinc" local mergelist "betnr wage_quintile"
			
		* merging
		di "`restriction'"
		merge m:1 `mergelist' using "$data/coworker_wage_changes`restriction'FT.dta"
	
		drop if _merge == 2
		drop _merge
		
		* dropping SOEP respondents with zero expected wage change if that's the loop we're in
		* (implemented by blanking the coworker wage-change measures; rows are dropped later by the oo_nonm filter)
		if "`addrest'"=="nz" {
			
			foreach year in 2015 2017 {
				foreach var in mean_delta_ln_wage_mover med_delta_ln_wage_mover mean_delta_ln_wage_mover_r0 ///
					mean_delta_ln_wage_mover_r1 med_delta_ln_wage_mover_r0 med_delta_ln_wage_mover_r1 ///
					ebayes_delta_ln_wage_mover {
						replace `var'_`year' = . if monetary_surplus_pct==0
				}
				
			}
		
		}
	
	}
		
	* merging firm IDs onto AKM effects
	* NOTE(review): unlike the other merges, this one neither drops using-only rows nor _merge.
	* Using-only rows carry no survey answers, so the belief filters below remove them, but the
	* _merge variable stays in the saved dataset -- confirm whether downstream files rely on it.
	merge m:1 betnr using "${orig}/SOEP-ADIAB_7519_v1_akm_estab.dta"

	* merging on coworker turnover and wage dispersion data (used in heterogeneity cuts)
	di "merging on coworker turnover data"
	cap noisily merge m:1 betnr using "$data/coworker_turnover.dta", keep(master match) nogen keepusing(turnover sd_wages) // created by 3_firm_turnover.do
	
	* merging on 'years of education' (rather than 3-category IAB education) variable, for use in heterogeneity cuts
	merge m:1 pid using "$data/GSOEPIS2019_survey.dta", keep(master match) keepusing(yearsedu) nogen
	
	* restrictions: keep only observations with non-missing outside option belief (happens before winsorization), and some nonmissing OO measure
	di "observation check beliefs `rprefix'"
	
	* which belief must be non-missing depends on the coworker sample
	if "`restriction'"=="" keep if salary_switchout_change_mid!=.
	if "`restriction'"=="_unemp" keep if monetary_surplus_pct!=.
	if "`restriction'"!="" & "`restriction'"!="_unemp" keep if monetary_surplus_pct !=. | salary_switchout_change_mid!=.
	
	* an outside-option (OO) measure counts as present if the 2015 mean coworker wage change is
	* non-missing or, in the intermediate-unemployment sample, if the ML prediction is non-missing
	di "observation check OO `rprefix'"
	gen oo_nonm = 0
	replace oo_nonm = 1 if mean_delta_ln_wage_mover_2015!=.
	if "`restriction'"=="_unemp" replace oo_nonm = 1 if p_delta_ln_wage_mover!=.
	keep if oo_nonm==1

	
	
	*** winsorizing and creating new variables

	* winsorizing firm effects
	winsor feff_2010_2017, gen(firm_effect) p(0.02)
	sum firm_effect, detail
		
	* winsorizing beliefs about median salary in occupation
	winsor l_d_guess_median, gen(l_d_guess_median_w) p(0.02)
	
	* subjective probabilities about earning more, less, or the same at outside option
	* (a negative surplus picks the "more" probability, zero the "same", positive the "less")
	gen prob = .
	replace prob = newpay_quit_more_proba if monetary_surplus_pct<0
	replace prob = newpay_quit_same_proba if monetary_surplus_pct==0
	* Stata treats missing as larger than any number, so ">0" alone would also
	* pick up respondents whose surplus is missing; exclude them explicitly
	replace prob = newpay_quit_less_proba if monetary_surplus_pct>0 & !missing(monetary_surplus_pct)
	
	* belief about own salary rank in occupation
	gen occupation_pctl_belief = (salary_less_proba + 100-salary_more_proba)/2
	cap noisily sum occupation_pctl_belief if feff_2010_2017!=., d
	* count distinct individuals behind the summary above; the count is only printed when it is at least 20
	quietly levelsof pid if occupation_pctl_belief!=. & feff_2010_2017!=.
	local distinct_observations = r(r) 
	if `distinct_observations'>=20 {
			display("occ rank belief based on observations from `distinct_observations' individuals")
	}
	
	* true salary rank in occupation, and corresponding error
	* (ventile k is mapped to k*5 - 2.5)
	gen occupation_pctl_true = ieb_beruf_kons_num_ventile*5 - 2.5
	gen occupation_pctl_error = occupation_pctl_belief - occupation_pctl_true
	
	* beliefs about wage changes of coworker switchers
	* (built from the raw belief, before that variable is itself winsorized in the loop below)
	gen ln_mover_change = ln(1+salary_switchout_change_mid/100)
	winsor ln_mover_change, gen(ln_mover_change_w) p(0.02)
	gen pct_mover_change = salary_switchout_change_mid/100
	winsor pct_mover_change, gen(pct_mover_change_w) p(0.02)

	* belief about wage change accompanying own switch to outside option
	* (sign is flipped: the own change is minus the monetary surplus)
	gen ln_own_change = ln(1-monetary_surplus_pct_w2/100)
	winsor ln_own_change, gen(ln_own_change_w) p(0.02)
	gen pct_own_change = -monetary_surplus_pct_w2/100
	winsor pct_own_change, gen(pct_own_change_w) p(0.02)
	
	* continuous 'magnitude of negotiation' variable that recodes ranges to their midpoints
	cap drop negotiate_pct_c
	cap noisily {
		gen negotiate_pct_c = negotiate_pct
		recode negotiate_pct_c (1 = 0) (2 = 1) (3 = 3.5) (4 = 7.5) (5 = 12.5) (6 = 25)
	}
	* male dummy
	* NOTE(review): a missing lsex is coded 0 here rather than missing -- confirm lsex is never missing
	gen male = lsex==1	
		
	* winsorizing in place: raw variable is renamed, winsorized into the original name, then dropped
	foreach yvar of varlist age  salary  tenure  ///
				   w_surplus  w_surplus_pct  monetary_surplus   monetary_surplus_pct   ///
				   amenity_surplus  amenity_surplus_pct  amenity_share  monetary_share  ///
					salary_switchout_change_mid	paycut_quit_pct{
			rename `yvar' _`yvar'
			winsor _`yvar', p(0.02) gen(`yvar')
			drop _`yvar'
	}
		
	* normalise all probabilities to [0, 1]
	foreach behavior in switchout_proba negotiate_proba negotiate_pct_c paycut_quit_pct{
		replace `behavior' = `behavior' / 100
	}
		
	* saving dataset of main sample (one obs per individual) for GSOEP/mover characteristic comparisons
	if "`restriction'"=="_unemp" & "`addrest'"=="FTp" {
		preserve
			
			keep if mean_delta_ln_wage_mover_2015!=.
			di _N
				
			keep pid
				
			* keep the first row of each pid
			sort pid
			cap drop dup
			quietly by pid: gen dup = cond(_N==1,0,_n)
			drop if dup>1
			drop dup
				
			save "$data/mainsample_ids.dta", replace
		restore
		
	}
	
	* saving dataset
	save "$data/`rprefix'_finaldata.dta", replace 	
	
	} // cap noisily
	
	* a captured error means no new file was saved for this sample, so any copy from an
	* earlier run stays on disk; make the failure explicit in the log
	if _rc di as error "sample [`restriction'] [`addrest'] failed with return code " _rc
	
	} // if condition to restrict to relevant samples 

	} // addrest loop 
			
} // restriction loop 
	
* Dataset for the non-main analyses: the union of the two main samples.
* The intermediate-unemployment sample is loaded first and the unrestricted
* sample is appended; sortvar marks the source (0 = first file, 1 = appended file).
use "$data/FTp_u__finaldata.dta", clear
append using "$data/FTp__finaldata.dta", generate(sortvar)

* number of rows before de-duplication
di _N

* one row per pid-jahr: within each pair rows are ordered by sortvar and only the first is kept
bysort pid jahr (sortvar): keep if _n==1
drop sortvar

* number of rows after de-duplication
di _N

summarize _all

save "$data/unionsample_finaldata.dta", replace	

* Person IDs of the union sample, reduced to one row per pid
keep pid
bysort pid: keep if _n==1
save "$data/unionsample_ids.dta", replace



* close the log and empty memory
log close
clear
